#!/usr/bin/env python3
'''
Author: Paschalis Agapitos
Project: Mestizajes

Builds a directed Wikipedia biographies network by combining the global link
graph with human-specific metadata, enriching nodes with temporal, gender,
and notability-related attributes, and exporting the resulting structured
graph data for downstream connectivity and historical analyses.
'''

from pathlib import Path
from typing import Union

import pandas as pd
import networkx as nx
from connecting_people.preprocessing import get_century, gender


# columns of the humans table merged onto each "Source" node; place_of_birth and
# place_of_death are left out because they are not part of this export
_HUMAN_METADATA_COLUMNS = [
    "en_wikipedia_title",
    "gender",
    "date_of_birth",
    "date_of_death",
    "notability_features",
    "field_of_human_activity",
]

# column order of the exported edge list
_EXPORT_COLUMNS = [
    "Source",
    "date_of_birth",
    "YOB_S",
    "COB_S",
    "date_of_death",
    "YOD_S",
    "COD_S",
    "notability_features",
    "field_of_human_activity",
    "gender",
    "Target",
]

# (exported column, node attribute name) pairs used when building the graph
_NODE_ATTRIBUTE_COLUMNS = [
    ("date_of_birth", "date_of_birth"),
    ("YOB_S", "year_of_birth"),
    ("COB_S", "century_of_birth"),
    ("date_of_death", "date_of_death"),
    ("YOD_S", "year_of_death"),
    ("COD_S", "century_of_death"),
    ("notability_features", "red_occups"),
    ("field_of_human_activity", "cluster"),
    ("gender", "gender"),
]


def _add_year_and_century_columns(frame: pd.DataFrame) -> pd.DataFrame:
    """add YOB_S/COB_S and YOD_S/COD_S columns derived from the date columns."""
    for date_col, year_col, century_col in (
        ("date_of_birth", "YOB_S", "COB_S"),
        ("date_of_death", "YOD_S", "COD_S"),
    ):
        # extract_year is a project helper; whatever it returns is coerced to a
        # number, and anything non-numeric becomes NaN
        years = frame[date_col].apply(get_century.extract_year)
        frame[year_col] = pd.to_numeric(years, errors="coerce")
        # only call get_centuries for rows that actually have a year
        frame[century_col] = frame[year_col].apply(
            lambda year: get_century.get_centuries(year)
            if not pd.isna(year)
            else None
        )
    return frame


def _attach_human_metadata(
    frame: pd.DataFrame,
    humans: pd.DataFrame,
) -> pd.DataFrame:
    """left-join humans metadata on "Source" and add year/century columns."""
    merged = frame.merge(
        humans[_HUMAN_METADATA_COLUMNS],
        how="left",
        left_on="Source",
        right_on="en_wikipedia_title",
    )
    return _add_year_and_century_columns(merged)


def build_connecting_people_graph(
    input_file_path: Union[str, Path],
    output_file_path: Union[str, Path],
) -> nx.DiGraph:
    """
    build a directed Wikipedia biographies graph with biographical metadata.

    this function:
    - loads the global Wikipedia graph, an id–title mapping, and the humans metadata table
    - restricts the graph to nodes that correspond to human biographies
    - attaches birth/death year and century, gender, and notability information
    - adds isolated human nodes (biographies that are present in the global graph
      but have no edge to or from another biography)
    - writes an edge list with associated metadata to a parquet file
    - returns the resulting NetworkX DiGraph for further analysis in a notebook

    parameters
    ----------
    input_file_path:
        base data directory that contains the following relative paths:
        - "en/graph/en_wiki_graph.parquet"
        - "en/graph/en_id_node_mapping.parquet"
        - "person_results/EN/2025/en_humans_2025_notability_features_v2.parquet"
    output_file_path:
        path to the parquet file where the enriched edge list will be written.
        missing parent directories are created.

    returns
    -------
    nx.DiGraph
        directed graph of human biographies. nodes that appear as "Source" in the
        exported table carry the attributes date_of_birth, year_of_birth,
        century_of_birth, date_of_death, year_of_death, century_of_death,
        red_occups, cluster and gender; other nodes (edge targets that never
        appear as a source, isolates without an activity field) have no attributes.

    notes
    -----
    the exported table has one row per edge plus one row per isolated biography;
    isolate rows have a missing "Target". rows whose source has no
    field_of_human_activity are not exported.
    """
    base_dir = Path(input_file_path)
    output_file_path = Path(output_file_path)

    # infer all required input paths from the base directory
    graph_path = base_dir / "en/graph/en_wiki_graph.parquet"
    humans_path = base_dir / "person_results/EN/2025/en_humans_2025_notability_features_v2.parquet"
    mapping_df_path = base_dir / "en/graph/en_id_node_mapping.parquet"

    # load graph edge list and id→title mapping
    graph_2025 = pd.read_parquet(graph_path, engine="pyarrow")
    mapping_df = pd.read_parquet(mapping_df_path)

    # map numeric node ids to Wikipedia titles (ids without a mapping become NaN)
    id_to_label = dict(zip(mapping_df["id"], mapping_df["label"]))
    graph_2025["Source"] = graph_2025["Source"].map(id_to_label)
    graph_2025["Target"] = graph_2025["Target"].map(id_to_label)

    # load humans metadata table
    humans = pd.read_parquet(humans_path, engine="pyarrow")

    # identify nodes that correspond to biographies; missing titles are dropped
    # first so that an unmapped id can never be treated as a biography node
    graph_titles = set(graph_2025["Source"].dropna()) | set(
        graph_2025["Target"].dropna()
    )
    bios_only_nodes = set(humans["en_wikipedia_title"].dropna()) & graph_titles

    # restrict the global graph to biography nodes only
    subgraph_edges = graph_2025[
        graph_2025["Source"].isin(bios_only_nodes)
        & graph_2025["Target"].isin(bios_only_nodes)
    ]

    # build a directed NetworkX graph from the filtered edges
    humans_graph = nx.from_pandas_edgelist(
        subgraph_edges,
        source="Source",
        target="Target",
        create_using=nx.DiGraph,
    )
    # a graph built from an edge list only contains nodes that have an edge, so
    # biographies without any biography-to-biography link must be added
    # explicitly; otherwise nx.isolates() below can never return anything
    humans_graph.add_nodes_from(bios_only_nodes)

    # convert the (de-duplicated) edges back to a dataframe and attach the
    # source-node metadata
    humans_graph_df = _attach_human_metadata(
        nx.to_pandas_edgelist(humans_graph, source="Source", target="Target"),
        humans,
    )

    # identify isolated biography nodes (no edges in the restricted graph);
    # sorted so that the exported rows and the node order are reproducible
    isolates = sorted(nx.isolates(humans_graph))
    isolates_df = _attach_human_metadata(
        pd.DataFrame(isolates, columns=["Source"]),
        humans,
    )

    # combine edge rows and isolate rows; isolate rows get a missing "Target"
    upd_humans_graph_df = pd.concat(
        [humans_graph_df, isolates_df],
        ignore_index=True,
    )

    # clean and encode gender information: strip list brackets and double
    # quotes from the raw value, then hand it to the project helper
    upd_humans_graph_df["gender"] = (
        upd_humans_graph_df["gender"]
        .astype(str)
        .str.replace(r'[\[\]""]', "", regex=True)
    )
    upd_humans_graph_df["gender"] = upd_humans_graph_df["gender"].map(
        gender.gender_to_category
    )

    # keep a consistent column order for the exported edge list
    upd_humans_graph_df = upd_humans_graph_df[_EXPORT_COLUMNS]

    # remove rows without an assigned activity field
    upd_humans_graph_df = upd_humans_graph_df[
        ~upd_humans_graph_df["field_of_human_activity"].isna()
    ]

    # precompute node-level metadata keyed by source title; iterating the
    # columns directly avoids building one Series per row
    attribute_names = [name for _, name in _NODE_ATTRIBUTE_COLUMNS]
    attribute_columns = [
        upd_humans_graph_df[column] for column, _ in _NODE_ATTRIBUTE_COLUMNS
    ]
    source_metadata = {
        source: dict(zip(attribute_names, values))
        for source, *values in zip(upd_humans_graph_df["Source"], *attribute_columns)
    }

    # create an empty directed graph for the final network
    G_upd = nx.DiGraph()

    # add edges and ensure both endpoints are present as nodes; metadata is
    # attached whenever the node also appears as a source
    for source_node, target_node in zip(
        upd_humans_graph_df["Source"], upd_humans_graph_df["Target"]
    ):
        if source_node not in G_upd:
            G_upd.add_node(source_node, **source_metadata[source_node])

        if pd.isna(target_node):
            # isolate row: the node exists now, but there is no edge to add
            continue

        if target_node not in G_upd:
            G_upd.add_node(target_node, **source_metadata.get(target_node, {}))

        G_upd.add_edge(source_node, target_node)

    # isolates whose row was removed by the activity-field filter are still
    # kept as nodes; they have no exported metadata
    for node in isolates:
        if node not in G_upd:
            G_upd.add_node(node, **source_metadata.get(node, {}))

    # write the enriched edge list to a parquet file for downstream analysis
    output_file_path.parent.mkdir(parents=True, exist_ok=True)
    upd_humans_graph_df.to_parquet(
        output_file_path,
        engine="pyarrow",
        compression="gzip",
    )

    return G_upd